#!/usr/bin/python
# -*- coding: utf-8 -*-

from mongodb import mongodb
import re
import time

class AlignPublish_time:
    """Normalise the ``publish_time`` field of documents in ``newsv1``.

    Textual publish times are parsed and rewritten as timestamps (seconds
    since the epoch, interpreted in the local timezone by ``time.mktime``).
    """

    # Matches a single character in the U+4E00..U+9FA5 range (the Chinese
    # date/time unit characters such as those used in "YYYY?MM?DD?" dates).
    _CHINESE_CHAR = re.compile(u"[\u4e00-\u9fa5]")

    # Canonical layout every input is padded to before parsing.
    _TIME_FORMAT = "%Y-%m-%d %H:%M:%S"

    def __init__(self):
        # NOTE(review): `mongodb` is a project-local helper; it is assumed
        # that get_db() returns a database handle exposing `newsv1`.
        dbcl = mongodb()
        self.db = dbcl.get_db()

    def aligntimeformat(self):
        """Rewrite textual ``publish_time`` values in ``newsv1`` as timestamps.

        Every document whose ``publish_time`` is not ``None`` is visited:

        * values that are already ``float`` are left untouched;
        * strings that parse as a float (already numeric) are left untouched;
        * any other string is cleaned, parsed with ``string2Timestamp`` and
          written back with ``$set``.

        A progress line is printed after every 1000 converted documents.

        Raises:
            ValueError: propagated from ``string2Timestamp`` when a value
                cannot be brought into ``%Y-%m-%d %H:%M:%S`` form.
        """
        rsts = self.db.newsv1.find(
            {'publish_time': {'$ne': None}},
            {'_id': 1, 'publish_time': 1})

        count = 0
        for rst in rsts:
            publishtimeStamp = rst['publish_time']
            if isinstance(publishtimeStamp, float):
                continue
            # NOTE: str.strip treats its argument as a *set of characters*,
            # so the second call removes any leading/trailing '<', '/', 'p'
            # or '>' rather than only the literal "</p>" tag.
            publishtimeStamp = publishtimeStamp.strip("\n").strip("</p>")
            if self.isfloat(publishtimeStamp):
                # Already a numeric timestamp stored as text; parsing it as
                # a date would fail, so leave it alone.
                continue
            count += 1
            publishtimeStamp = self.string2Timestamp(publishtimeStamp)
            # NOTE(review): Collection.update is deprecated since PyMongo 3
            # and removed in PyMongo 4; switch to update_one if the driver
            # is upgraded.
            self.db.newsv1.update(
                {'_id': rst["_id"]},
                {'$set': {'publish_time': publishtimeStamp}})
            if count % 1000 == 0:
                print ("Processed "+ str(count)+" news")

    def string2Timestamp(self, TimeString):
        """Parse a date/time string into a local-time epoch timestamp.

        Accepted shapes are ``Y-m-d``, ``Y-m-d H``, ``Y-m-d H:M`` and
        ``Y-m-d H:M:S``; Chinese unit characters between the date fields are
        treated as ``-`` separators. Missing time components default to zero.

        Args:
            TimeString: the text to parse.

        Returns:
            ``0`` for an empty string, otherwise the ``float`` returned by
            ``time.mktime``.

        Raises:
            ValueError: if the padded string does not match
                ``%Y-%m-%d %H:%M:%S``.
        """
        if len(TimeString) == 0:
            return 0

        # Check whether the string contains Chinese characters.
        if self._CHINESE_CHAR.search(TimeString):
            # Each Chinese character becomes '-', which leaves one separator
            # too many after the day field; drop the last '-'.
            timeTemp = self._CHINESE_CHAR.sub('-', TimeString)
            last_dash = timeTemp.rfind('-')
            timeTemp = timeTemp[:last_dash] + timeTemp[last_dash + 1:]
        else:
            timeTemp = TimeString

        # Look for ':' -- a bare date (no colon, no space) gets an hour field.
        if ":" not in timeTemp and len(timeTemp.split(" ")) == 1:
            timeTemp += " 00"

        # Pad minutes/seconds until the string has the H:M:S shape.
        while timeTemp.count(":") < 2:
            timeTemp += ':00'
        return time.mktime(time.strptime(timeTemp, self._TIME_FORMAT))

    def isfloat(self, publishTime):
        """Return True if ``publishTime`` can be converted with ``float()``.

        Args:
            publishTime: the value to test (typically a string).

        Returns:
            ``True`` when ``float(publishTime)`` succeeds, ``False`` when it
            raises ``TypeError`` or ``ValueError``.
        """
        try:
            float(publishTime)
        except (TypeError, ValueError):
            return False
        return True

if __name__ == "__main__":
    # Script entry point: connect to the database and run the
    # publish_time normalisation pass once.
    AlignPublish_time().aligntimeformat()
